package cn.fuzongyao.crawler.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.nio.charset.Charset;

/**
 * @author fuzongyao
 * @date 2020/09/21
 * @since 1.0.0
 */
@Slf4j
public class JsoupUtils {

    public static Document parse(String url, Charset charset) {
        if (url == null || url.isEmpty()) {
            return null;
        }

        //建立一个新的请求客户端
        CloseableHttpClient httpClient = HttpClients.createDefault();
        //使用HttpGet的方式请求网址
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(RequestConfig.custom()
                .setConnectionRequestTimeout(500)
                .setConnectTimeout(500)
                .setSocketTimeout(500).build());
        for (int i = 0; i < 3; i++) {
            try {
                // 获取网址的返回结果
                CloseableHttpResponse response = httpClient.execute(httpGet);
                // 获取返回结果中的实体
                HttpEntity entity = response.getEntity();
                String html = EntityUtils.toString(entity, charset);
                Document document = Jsoup.parse(html);
                document.setBaseUri(url);
                return document;
            } catch (Exception e) {
                log.error("retry url: " + url);
            }
        }
        log.error("获取数据异常，url：" + url);
        return null;
    }
}
